In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statistics
import seaborn as sns
def print_full_dataframe(df: pd.DataFrame) -> None:
    """Print ``df`` with every column shown on a single, unwrapped line.

    The pandas display options are overridden only for the duration of the
    call (``pd.option_context`` restores them afterwards), so the global
    display configuration is left untouched.

    Args:
        df: The frame to print.
    """
    with pd.option_context(
        'display.max_columns', None,          # never elide columns with "..."
        'display.expand_frame_repr', False,   # never wrap wide frames
    ):
        print(df)
# Load the four raw CSV inputs. Only the last file is read as Latin-1.
df = pd.read_csv('master.csv')
df2 = pd.read_csv('share-with-anxiety-disorders.csv')
df3 = pd.read_csv('Countries_GDP_1960-2020.csv')
df4 = pd.read_csv('DP_LIVE_13042023005821788.csv', encoding='latin1')

separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Report the per-column missing-value counts of every raw frame.
for heading, frame in (
    ("df.isnull().sum()", df),
    ("df2.isnull().sum()", df2),
    ("df3.isnull().sum()", df3),
    ("df4.isnull().sum()", df4),
):
    print(heading)
    print(separator)
    null_counts = frame.isnull().sum()
    if frame is df3:
        # df3 has one column per year, so its counts are shown as a single
        # wide row with no column truncation.
        null_values_df = null_counts.to_frame().T
        print_full_dataframe(null_values_df)
    else:
        print(null_counts)
    print(gap)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Dump each raw frame under its variable name, using pandas' default repr.
for heading, frame in (("df", df), ("df2", df2), ("df3", df3), ("df4", df4)):
    print(heading)
    print(separator)
    print(frame)
    print(gap)
# Start filtering the DataFrames.

# Fill missing HDI values with the column mean (taken over all years, before
# the year filter is applied). The result is assigned back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning in recent pandas and does not modify `df` at all once
# Copy-on-Write is enabled.
df['HDI for year'] = df['HDI for year'].fillna(df['HDI for year'].mean())

# Convert the ' gdp_for_year ($) ' column to a numeric type by stripping the
# thousands separators first.
df[' gdp_for_year ($) '] = df[' gdp_for_year ($) '].str.replace(',', '').astype(float)

# Keep the years 2000-2015 (inclusive). The explicit .copy() calls make the
# filtered frames independent objects, so later in-place operations on them
# never act on a slice of the raw data.
filtered_df = df[(df['year'] >= 2000) & (df['year'] <= 2015)].copy()

df2_filtered = df2[(df2['Year'] >= 2000) & (df2['Year'] <= 2015)].drop(columns='Code')

# df3 is wide (one column per year), so the year filter is a column selection.
years_to_keep = [str(year) for year in range(2000, 2016)]  # '2000' ... '2015'
columns_to_keep = ['Country Name', 'Country Code'] + years_to_keep
df3_filtered = df3[columns_to_keep].copy()

# The first header of df4 is read with literal double quotes around it, so it
# is renamed to the bare name before the unused columns are dropped.
df4_filtered = (
    df4[(df4['TIME'] >= 2000) & (df4['TIME'] <= 2015)]
    .rename(columns={'"LOCATION"': 'LOCATION'})
    .drop(columns=['Flag Codes', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'INDICATOR'])
)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# For every filtered frame: report the missing-value counts, then save it.
for heading, frame, csv_path in (
    ("filtered_df.isnull().sum()", filtered_df, 'filtered_df.csv'),
    ("df2_filtered.isnull().sum()", df2_filtered, 'df2_filtered.csv'),
    ("df3_filtered.isnull().sum()", df3_filtered, 'df3_filtered.csv'),
    ("df4_filtered.isnull().sum()", df4_filtered, 'df4_filtered.csv'),
):
    print(heading)
    print(separator)
    null_counts = frame.isnull().sum()
    if frame is df3_filtered:
        # Wide frame: show the counts as one untruncated row.
        null_values_df = null_counts.to_frame().T
        print_full_dataframe(null_values_df)
    else:
        print(null_counts)
    frame.to_csv(csv_path, index=False)
    print(gap)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Dump each filtered frame under its variable name.
for heading, frame in (
    ("filtered_df", filtered_df),
    ("df2_filtered", df2_filtered),
    ("df3_filtered", df3_filtered),
    ("df4_filtered", df4_filtered),
):
    print(heading)
    print(separator)
    print(frame)
    print(gap)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Mean of the per-row 'suicides/100k pop' values for several groupings.
# Only the single-key country and age series are sorted (descending); the
# (year, key) series keep their index order.
average_suicide_rates_by_country = (
    filtered_df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)
)
average_suicide_rates_by_country_and_year = (
    filtered_df.groupby(['year', 'country'])['suicides/100k pop'].mean()
)
average_suicide_rates_by_gender = filtered_df.groupby('sex')['suicides/100k pop'].mean()
average_suicide_rates_by_gender_and_year = (
    filtered_df.groupby(['year', 'sex'])['suicides/100k pop'].mean()
)
average_suicide_rates_by_age = (
    filtered_df.groupby('age')['suicides/100k pop'].mean().sort_values(ascending=False)
)
average_suicide_rates_by_age_and_year = (
    filtered_df.groupby(['year', 'age'])['suicides/100k pop'].mean()
)

# Each heading echoes the expression that produced the value printed below it.
# The (year, country) and (year, age) headings previously claimed a
# sort_values() call that the code does not perform.
for heading, result in (
    ("average_suicide_rates_by_country = filtered_df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)",
     average_suicide_rates_by_country),
    ("average_suicide_rates_by_country_and_year = filtered_df.groupby(['year','country'])['suicides/100k pop'].mean()",
     average_suicide_rates_by_country_and_year),
    ("average_suicide_rates_by_gender = filtered_df.groupby('sex')['suicides/100k pop'].mean()",
     average_suicide_rates_by_gender),
    ("average_suicide_rates_by_gender_and_year = filtered_df.groupby(['year','sex'])['suicides/100k pop'].mean()",
     average_suicide_rates_by_gender_and_year),
    ("average_suicide_rates_by_age = filtered_df.groupby('age')['suicides/100k pop'].mean().sort_values(ascending=False)",
     average_suicide_rates_by_age),
    ("average_suicide_rates_by_age_and_year = filtered_df.groupby(['year','age'])['suicides/100k pop'].mean()",
     average_suicide_rates_by_age_and_year),
):
    print(heading)
    print(separator)
    print(result)
    print(gap)
# Pairwise correlation coefficients between the suicide rate and the two GDP
# columns (DataFrame.corr with its default settings).
correlation_columns = ['suicides/100k pop', 'gdp_per_capita ($)', ' gdp_for_year ($) ']
correlations = filtered_df[correlation_columns].corr()
print("correlations = filtered_df[['suicides/100k pop', 'gdp_per_capita ($)', ' gdp_for_year ($) ']].corr()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(correlations)
print("\n\n\n")  # plus print()'s own newline == four bare print() calls
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Mean 'suicides/100k pop' by generation (sorted descending), by
# (year, generation) and by year (both left in index order).
average_suicide_rates_by_generation = (
    filtered_df.groupby('generation')['suicides/100k pop'].mean().sort_values(ascending=False)
)
average_suicide_rates_by_generation_and_year = (
    filtered_df.groupby(['year', 'generation'])['suicides/100k pop'].mean()
)
average_suicide_rates_by_year = filtered_df.groupby('year')['suicides/100k pop'].mean()

# Each heading echoes the expression that produced the value printed below it.
# The (year, generation) heading previously claimed a sort_values() call that
# the code does not perform.
for heading, result in (
    ("average_suicide_rates_by_generation = filtered_df.groupby('generation')['suicides/100k pop'].mean().sort_values(ascending=False)",
     average_suicide_rates_by_generation),
    ("average_suicide_rates_by_generation_and_year = filtered_df.groupby(['year', 'generation'])['suicides/100k pop'].mean()",
     average_suicide_rates_by_generation_and_year),
    ("average_suicide_rates_by_year = filtered_df.groupby('year')['suicides/100k pop'].mean()",
     average_suicide_rates_by_year),
):
    print(heading)
    print(separator)
    print(result)
    print(gap)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls
prevalence_column = 'Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'

# Anxiety prevalence per (Entity, Year), per Entity, and the per-Year mean and
# median across entities.
prevalence_by_country = df2_filtered.groupby(['Entity', 'Year'])[prevalence_column].mean()
average_prevalence_by_country = df2_filtered.groupby('Entity')[prevalence_column].mean()
mean_prevalence = df2_filtered.groupby('Year')[prevalence_column].mean()
median_prevalence = df2_filtered.groupby('Year')[prevalence_column].median()

# Each heading echoes the expression that produced the value printed below it.
for heading, result in (
    (f"prevalence_by_country = df2_filtered.groupby(['Entity', 'Year'])['{prevalence_column}'].mean()",
     prevalence_by_country),
    (f"average_prevalence_by_country = df2_filtered.groupby('Entity')['{prevalence_column}'].mean()",
     average_prevalence_by_country),
    (f"mean_prevalence = df2_filtered.groupby('Year')['{prevalence_column}'].mean()",
     mean_prevalence),
    (f"median_prevalence = df2_filtered.groupby('Year')['{prevalence_column}'].median()",
     median_prevalence),
):
    print(heading)
    print(separator)
    print(result)
    print(gap)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Per-year column totals and means of the GDP table; the first two columns
# (country name and code) are skipped.
total_gdp = df3_filtered.iloc[:, 2:].sum(axis=0)
average_gdp = df3_filtered.iloc[:, 2:].mean(axis=0)

# Per-year mean and median of the df4 'Value' column.
mean_unemployment = df4_filtered.groupby('TIME')['Value'].mean()
median_unemployment = df4_filtered.groupby('TIME')['Value'].median()

# Each heading echoes the expression that produced the value printed below it.
for heading, result in (
    ("total_gdp = df3_filtered.iloc[:, 2:].sum(axis=0)", total_gdp),
    ("average_gdp = df3_filtered.iloc[:, 2:].mean(axis=0)", average_gdp),
    ("mean_unemployment = df4_filtered.groupby('TIME')['Value'].mean()", mean_unemployment),
    ("median_unemployment = df4_filtered.groupby('TIME')['Value'].median()", median_unemployment),
):
    print(heading)
    print(separator)
    print(result)
    print(gap)
# Give df2_filtered, df3_filtered and df4_filtered the key column names used
# by filtered_df ('country', 'year') plus a shared 'country_code'.
df2_column_names = {'Entity': 'country', 'Year': 'year'}
df3_column_names = {'Country Name': 'country', 'Country Code': 'country_code'}
df4_column_names = {'LOCATION': 'country_code', 'TIME': 'year', 'Value': 'value'}
df2_filtered.rename(columns=df2_column_names, inplace=True)
df3_filtered.rename(columns=df3_column_names, inplace=True)
df4_filtered.rename(columns=df4_column_names, inplace=True)

# Rewrite the CSV exports so the files carry the renamed columns.
for frame, csv_path in (
    (filtered_df, 'filtered_df.csv'),
    (df2_filtered, 'df2_filtered.csv'),
    (df3_filtered, 'df3_filtered.csv'),
    (df4_filtered, 'df4_filtered.csv'),
):
    frame.to_csv(csv_path, index=False)
# First merge: attach the anxiety-prevalence column to every suicide row.
# A left join keeps all rows of filtered_df even when there is no match.
merged_df = filtered_df.merge(df2_filtered, on=['country', 'year'], how='left')
merged_df.to_csv('merged_df.csv', index=False)
print("merged_df = filtered_df.merge(df2_filtered, on=['country', 'year'], how='left')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df)
print("\n\n\n")  # plus print()'s own newline == four bare print() calls
# Second merge: attach GDP. df3_filtered is wide (one column per year), so it
# is first reshaped to long format: one row per (country, country_code, year).
df3_long = df3_filtered.melt(
    id_vars=['country', 'country_code'],
    var_name='year',
    value_name='gdp',
)
# The melted year labels are strings; cast them to int so they match the
# integer 'year' key of merged_df.
df3_long['year'] = df3_long['year'].astype(int)
df3_long.to_csv('df3_long.csv', index=False)

merged_df = merged_df.merge(df3_long, on=['country', 'year'], how='left')
merged_df.to_csv('merged_df2.csv', index=False)
print("merged_df = merged_df.merge(df3_long, on=['country', 'year'], how='left')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df)
print("\n\n\n")  # plus print()'s own newline == four bare print() calls
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Last merge: attach the df4 'value' column. df4_filtered has no country name,
# so the join key is the country code picked up from the GDP merge plus year.
merged_df = merged_df.merge(df4_filtered, on=['country_code', 'year'], how='left')

# Drop every row with a missing value in ANY column. This runs before the
# helper columns are removed, so a missing 'gdp' or 'country_code' also
# discards the row.
merged_df = merged_df.dropna()
merged_df = merged_df.drop(columns=['country-year', 'HDI for year', 'gdp', 'country_code'])
merged_df.to_csv('merged_df3.csv', index=False)

# The heading now names the real join keys; it previously said
# on=['country', 'year'].
print("merged_df = merged_df.merge(df4_filtered, on=['country_code', 'year'], how='left')")
print(separator)
print(merged_df)
print(gap)

print("merged_df.isnull().sum()")
print(separator)
print(merged_df.isnull().sum())
print(gap)

print("merged_df.describe(include='all')")
print(separator)
print(merged_df.describe(include='all'))
print(gap)
separator = "-------------------------------------------------------------------------------------------------------------------------------------------------------"
gap = "\n\n\n"  # plus print()'s own newline == four bare print() calls

# Collapse merged_df to one row per (country, year): counts are summed, while
# the remaining columns take the first value found in each group.
aggregations = {
    'suicides_no': 'sum',
    'population': 'sum',
    ' gdp_for_year ($) ': 'first',
    'gdp_per_capita ($)': 'first',
    'Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)': 'first',
    'value': 'first',
}
grouped_data = merged_df.groupby(['country', 'year']).agg(aggregations).reset_index()

# Shorter display names for the two merged-in indicators.
grouped_data = grouped_data.rename(columns={
    "Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)": "Anxiety Prevalence (%)",
    "value": "Unemployment Rate(%)",
})

# Recompute the rate from the summed totals of each group.
grouped_data['suicides/100k pop'] = (grouped_data['suicides_no'] / grouped_data['population']) * 100000

# Write the result to a new CSV file.
grouped_data.to_csv('transformed_data.csv', index=False)

for heading, result in (
    ("grouped_data = merged_df.groupby(['country', 'year']).agg", grouped_data),
    ("grouped_data.describe(include='all')", grouped_data.describe(include='all')),
):
    print(heading)
    print(separator)
    print(result)
    print(gap)
# Summary statistics of the df4 'value' column for the rows coded 'USA',
# shown as a bar chart.
tempDef2 = df4_filtered[df4_filtered['country_code'] == 'USA']
# Raw row array; not used below, kept in case other code references it.
npArray = tempDef2.to_numpy()
# Select the values by column name. The previous row loop used the positional
# index x[2], which silently reads the wrong column if the column order changes.
npArray1 = tempDef2['value'].to_numpy()

Mean = np.mean(npArray1)
Median = np.median(npArray1)
Std = np.std(npArray1)  # NumPy default ddof=0
q1 = np.percentile(npArray1, 25)
q3 = np.percentile(npArray1, 75)
IQR = q3 - q1

data = {'Mean': Mean, 'Median': Median, 'Std': Std, 'IQR': IQR}
Stats = list(data.keys())
values = list(data.values())

fig = plt.figure(figsize=(10, 5))
plt.bar(Stats, values, color='blue', width=0.4)
plt.title("Unemployment")
plt.ylabel('Value')
plt.xlabel('Statistic')
plt.show()
# ---- GDP summary statistics for the 'United States' row -------------------
# Per-column NaN counts (computed but not printed).
values = df3_filtered.isna()
count = values.sum()

# All yearly GDP values of the United States row(s), as a NumPy array.
is_usa = df3_filtered['country'] == 'United States'
usa_data = np.array(df3_filtered.loc[is_usa, '2000':'2015'])

Mean = np.mean(usa_data)
Median = np.median(usa_data)
std = np.std(usa_data)  # NumPy default ddof=0
q1, q3 = np.percentile(usa_data, [25, 75])
IQR = q3 - q1

# Rounded copies, printed in the order mean, median, std, IQR.
Mean1 = np.round(Mean, 2)
Median1 = np.round(Median, 2)
std1 = np.round(std, 2)
IQR1 = np.round(IQR, 2)
for rounded in (Mean1, Median1, std1, IQR1):
    print(rounded)

data = {'Mean': Mean1, 'Median': Median1, 'Std': std1, 'IQR': IQR1}
Stats = list(data)
values = list(data.values())

fig = plt.figure(figsize=(10, 5))
plt.bar(Stats, values, color='blue', width=0.4)
plt.title("GDP")
plt.ylabel('Value')
plt.xlabel('Statistic')
plt.show()
# ---- Anxiety-prevalence statistics for 'United States' --------------------
# Per-column NaN counts (computed but not printed).
values = df2_filtered.isna()
count = values.sum()

usa_data = df2_filtered.loc[df2_filtered['country'] == 'United States']
anxiety_disorders = usa_data['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)']

stat = np.round(anxiety_disorders.describe(), 2)
# Median of the data itself. This previously took np.median(stat), i.e. the
# median of the describe() summary (count, mean, std, min, ...), which is not
# a meaningful statistic.
median = np.median(anxiety_disorders)
print(stat)
print(median)

# Bar graph of the describe() summary.
fig, ax = plt.subplots()
ax.bar(stat.index, stat.values)
ax.set_title('Descriptive statistics for anxiety disorders(USA)')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
plt.show()
# ---- Descriptive statistics for suicides_no -------------------------------
# Per-column NaN counts (computed but not printed).
values = filtered_df.isna()
count = values.sum()

# Remove every column containing a NaN, then the columns listed below.
# NOTE: this rebinds filtered_df, so code after this point sees the reduced frame.
drop_column = [' gdp_for_year ($) ', 'gdp_per_capita ($)', 'country-year']
filtered_df = filtered_df.dropna(axis=1, how='any').drop(drop_column, axis=1)

numbers_suicide = filtered_df['suicides_no']
stat = np.round(numbers_suicide.describe(), 2)
median = np.median(numbers_suicide)
print(stat)
print(median)

# Bar graph of the describe() summary.
fig, ax = plt.subplots()
ax.bar(stat.index, stat.values)
ax.set_title('Descriptive statistics for suicides_no')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
plt.show()
# ---- Anxiety-prevalence statistics for 'Australia' (from merged_df) -------
# Per-column NaN counts (computed but not printed).
values = merged_df.isna()
count = values.sum()

Australia_data = merged_df.loc[merged_df['country'] == 'Australia']
anxiety_disorders = Australia_data['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)']

stat = np.round(anxiety_disorders.describe(), 2)
# Median of the data itself. This previously took np.median(stat), i.e. the
# median of the describe() summary (count, mean, std, min, ...), which is not
# a meaningful statistic.
median = np.median(anxiety_disorders)
print(stat)
print(median)

# Bar graph of the describe() summary.
fig, ax = plt.subplots()
ax.bar(stat.index, stat.values)
ax.set_title('Descriptive statistics for anxiety disorders(Australia)')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
plt.show()
# Create one figure holding a 6x2 grid of subplots for the overview plots.
fig, axs = plt.subplots(6, 2, figsize=(20, 30))

# Plot 1: Top N average suicide rates by country
top_n = 15
top_countries = average_suicide_rates_by_country.head(top_n)
axs[0, 0].barh(top_countries.index, top_countries.values)
axs[0, 0].invert_yaxis()  # first entry of the series at the top
axs[0, 0].set_title(f'Top {top_n} Average Suicide Rates by Country')
axs[0, 0].set_xlabel('Suicides per 100k Population')

# Plot 2: Average suicide rates by gender
axs[0, 1].bar(average_suicide_rates_by_gender.index, average_suicide_rates_by_gender.values)
axs[0, 1].set_title('Average Suicide Rates by Gender')
axs[0, 1].set_ylabel('Suicides per 100k Population')

# Plot 3: Average suicide rates by age
axs[1, 0].bar(average_suicide_rates_by_age.index, average_suicide_rates_by_age.values)
axs[1, 0].set_title('Average Suicide Rates by Age')
axs[1, 0].set_ylabel('Suicides per 100k Population')
# Rotate the existing category labels in place. Calling set_xticklabels()
# without fixing the ticks first makes Matplotlib warn and replaces the
# categorical formatter with a purely positional one.
plt.setp(axs[1, 0].get_xticklabels(), rotation=45, ha='right')

# Long-format copy of the (year, generation) averages with a numeric year.
# It is not used by any plot in this figure.
avg_suicide_rates_by_gen_and_year = average_suicide_rates_by_generation_and_year.reset_index()
avg_suicide_rates_by_gen_and_year['year'] = pd.to_numeric(avg_suicide_rates_by_gen_and_year['year'])

# Plot 4: Average suicide rates by generation
axs[1, 1].bar(average_suicide_rates_by_generation.index, average_suicide_rates_by_generation.values)
axs[1, 1].set_title('Average Suicide Rates by Generation')
axs[1, 1].set_ylabel('Suicides per 100k Population')
plt.setp(axs[1, 1].get_xticklabels(), rotation=45, ha='right')

# Plot 5: Suicide rates over time
axs[2, 0].plot(average_suicide_rates_by_year.index, average_suicide_rates_by_year.values)
axs[2, 0].set_xlabel('Year')
axs[2, 0].set_ylabel('Average Suicide Rate per 100k Population')
axs[2, 0].set_title('Suicide Rates Over Time')
# Plot 6: Average suicide rates by country and year, top countries only
top_n_year = 5
# Rank countries by the mean of their per-year averages and keep the
# `top_n_year` highest.
top_n_countries = (
    average_suicide_rates_by_country_and_year.groupby('country').mean().nlargest(top_n_year).index
)
# Restrict the (year, country) series to those countries.
top_n_average_suicide_rates_by_country_and_year = average_suicide_rates_by_country_and_year[
    average_suicide_rates_by_country_and_year.index.get_level_values('country').isin(top_n_countries)
]
# Flatten the index so 'year' and 'country' become ordinary columns.
top_n_average_suicide_rates_by_country_and_year = top_n_average_suicide_rates_by_country_and_year.reset_index()
top_n_average_suicide_rates_by_country_and_year['year'] = pd.to_numeric(
    top_n_average_suicide_rates_by_country_and_year['year']
)
# One line per country. The loop variable is deliberately not called `data`,
# so it cannot overwrite the module-level `data` dict built earlier.
for country, country_rows in top_n_average_suicide_rates_by_country_and_year.groupby('country'):
    axs[2, 1].plot(country_rows['year'], country_rows['suicides/100k pop'], label=country)
axs[2, 1].set_xlabel('Year')
axs[2, 1].set_ylabel('Suicides per 100k Population')
# Derive the count in the title from top_n_year so the two cannot drift apart.
axs[2, 1].set_title(f'Average Suicide Rates by Country and Year (Top {top_n_year} Countries)')
axs[2, 1].legend(title='Country', bbox_to_anchor=(1, 1))
# Plot 7: Average suicide rates by gender and year
# Plot 8: Average suicide rates by age and year
# unstack() moves the inner index level into columns, so each gender or age
# group is drawn as its own line over the years.
for series, panel, panel_title, legend_title in (
    (average_suicide_rates_by_gender_and_year, axs[3, 0],
     'Average Suicide Rates by Gender and Year', 'Gender'),
    (average_suicide_rates_by_age_and_year, axs[3, 1],
     'Average Suicide Rates by Age and Year', 'Age Group'),
):
    series.unstack().plot(ax=panel)
    panel.set_ylabel('Suicides per 100k Population')
    panel.set_title(panel_title)
    panel.legend(title=legend_title, loc='upper right')
# Plot 9: Top N average anxiety disorders prevalence by country
top_n = 15
top_n_prevalence = average_prevalence_by_country.sort_values(ascending=False).head(top_n)
axs[4, 0].barh(top_n_prevalence.index, top_n_prevalence.values)
axs[4, 0].invert_yaxis()  # highest prevalence at the top
axs[4, 0].set_xlabel('Anxiety Disorders Prevalence (%)')
# Take the count from top_n (as plot 1 does) instead of hard-coding "15".
axs[4, 0].set_title(f'Top {top_n} Average Anxiety Disorders Prevalence by Country')

# Plot 10: Anxiety disorders prevalence over time
axs[4, 1].plot(mean_prevalence.index, mean_prevalence.values, label='Mean Prevalence')
axs[4, 1].plot(median_prevalence.index, median_prevalence.values, label='Median Prevalence')
axs[4, 1].set_xlabel('Year')
axs[4, 1].set_ylabel('Prevalence of Anxiety Disorders (%)')
axs[4, 1].set_title('Anxiety Disorders Prevalence Over Time')
axs[4, 1].legend()
# Plot 11: Average GDP over time (one bar per year column)
gdp_panel = axs[5, 0]
gdp_panel.bar(average_gdp.index, average_gdp.values, label='Average GDP')
gdp_panel.set_xlabel('Year')
gdp_panel.set_ylabel('GDP (USD)')
gdp_panel.set_title('Average GDP Over Time')
gdp_panel.legend()

# Plot 12: Unemployment rate over time (mean and median lines)
unemployment_panel = axs[5, 1]
for series, line_label in (
    (mean_unemployment, 'Mean Unemployment Rate'),
    (median_unemployment, 'Median Unemployment Rate'),
):
    unemployment_panel.plot(series.index, series.values, label=line_label)
unemployment_panel.set_xlabel('Year')
unemployment_panel.set_ylabel('Unemployment Rate (%)')
unemployment_panel.set_title('Unemployment Rate Over Time')
unemployment_panel.legend()

# Adjust layout and display the whole grid.
fig.tight_layout()
plt.show()
df.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
country 0
year 0
sex 0
age 0
suicides_no 0
population 0
suicides/100k pop 0
country-year 0
HDI for year 19456
gdp_for_year ($) 0
gdp_per_capita ($) 0
generation 0
dtype: int64
df2.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity 0
Code 690
Year 0
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) 0
dtype: int64
df3.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
df4.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
"LOCATION" 0
INDICATOR 0
SUBJECT 0
MEASURE 0
FREQUENCY 0
TIME 0
Value 0
Flag Codes 586
dtype: int64
df
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
0 Albania 1987 male 15-24 years 21 312900
1 Albania 1987 male 35-54 years 16 308000
2 Albania 1987 female 15-24 years 14 289700
3 Albania 1987 male 75+ years 1 21800
4 Albania 1987 male 25-34 years 9 274300
... ... ... ... ... ... ...
27815 Uzbekistan 2014 female 35-54 years 107 3620833
27816 Uzbekistan 2014 female 75+ years 9 348465
27817 Uzbekistan 2014 male 5-14 years 60 2762158
27818 Uzbekistan 2014 female 5-14 years 44 2631600
27819 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
0 6.71 Albania1987 NaN 2,156,624,900
1 5.19 Albania1987 NaN 2,156,624,900
2 4.83 Albania1987 NaN 2,156,624,900
3 4.59 Albania1987 NaN 2,156,624,900
4 3.28 Albania1987 NaN 2,156,624,900
... ... ... ... ...
27815 2.96 Uzbekistan2014 0.675 63,067,077,179
27816 2.58 Uzbekistan2014 0.675 63,067,077,179
27817 2.17 Uzbekistan2014 0.675 63,067,077,179
27818 1.67 Uzbekistan2014 0.675 63,067,077,179
27819 1.46 Uzbekistan2014 0.675 63,067,077,179
gdp_per_capita ($) generation
0 796 Generation X
1 796 Silent
2 796 Generation X
3 796 G.I. Generation
4 796 Boomers
... ... ...
27815 2309 Generation X
27816 2309 Silent
27817 2309 Generation Z
27818 2309 Generation Z
27819 2309 Boomers
[27820 rows x 12 columns]
df2
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity Code Year \
0 Afghanistan AFG 1990
1 Afghanistan AFG 1991
2 Afghanistan AFG 1992
3 Afghanistan AFG 1993
4 Afghanistan AFG 1994
... ... ... ...
6835 Zimbabwe ZWE 2015
6836 Zimbabwe ZWE 2016
6837 Zimbabwe ZWE 2017
6838 Zimbabwe ZWE 2018
6839 Zimbabwe ZWE 2019
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)
0 4.84
1 4.82
2 4.80
3 4.79
4 4.78
... ...
6835 3.32
6836 3.32
6837 3.33
6838 3.32
6839 3.28
[6840 rows x 4 columns]
df3
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 1960 1961 \
0 Africa Eastern and Southern AFE 1.931311e+10 1.972349e+10
1 Africa Western and Central AFW 1.040428e+10 1.112805e+10
2 Australia AUS 1.860679e+10 1.968306e+10
3 Austria AUT 6.592694e+09 7.311750e+09
4 Burundi BDI 1.960000e+08 2.030000e+08
.. ... ... ... ...
115 St. Vincent and the Grenadines VCT 1.306656e+07 1.399988e+07
116 World WLD 1.390000e+12 1.440000e+12
117 South Africa ZAF 7.575397e+09 7.972997e+09
118 Zambia ZMB 7.130000e+08 6.962857e+08
119 Zimbabwe ZWE 1.052990e+09 1.096647e+09
1962 1963 1964 1965 1966 \
0 2.149392e+10 2.573321e+10 2.352744e+10 2.681057e+10 2.915216e+10
1 1.194335e+10 1.267652e+10 1.383858e+10 1.486247e+10 1.583285e+10
2 1.992272e+10 2.153993e+10 2.380110e+10 2.597715e+10 2.730989e+10
3 7.756110e+09 8.374175e+09 9.169984e+09 9.994071e+09 1.088768e+10
4 2.135000e+08 2.327500e+08 2.607500e+08 1.589950e+08 1.654446e+08
.. ... ... ... ... ...
115 1.452488e+07 1.370822e+07 1.475821e+07 1.510821e+07 1.609987e+07
116 1.550000e+12 1.670000e+12 1.820000e+12 1.990000e+12 2.160000e+12
117 8.497997e+09 9.423396e+09 1.037400e+10 1.133440e+10 1.235500e+10
118 6.931429e+08 7.187143e+08 8.394286e+08 1.082857e+09 1.264286e+09
119 1.117602e+09 1.159512e+09 1.217138e+09 1.311436e+09 1.281750e+09
1967 ... 2011 2012 2013 \
0 3.017317e+10 ... 9.430000e+11 9.510000e+11 9.640000e+11
1 1.442643e+10 ... 6.710000e+11 7.280000e+11 8.210000e+11
2 3.044462e+10 ... 1.400000e+12 1.550000e+12 1.580000e+12
3 1.157943e+10 ... 4.310000e+11 4.090000e+11 4.300000e+11
4 1.782971e+08 ... 2.235821e+09 2.333308e+09 2.451625e+09
.. ... ... ... ... ...
115 1.583518e+07 ... 6.761296e+08 6.929333e+08 7.212074e+08
116 2.290000e+12 ... 7.370000e+13 7.530000e+13 7.740000e+13
117 1.377739e+10 ... 4.580000e+11 4.340000e+11 4.010000e+11
118 1.368000e+09 ... 2.345952e+10 2.550306e+10 2.803724e+10
119 1.397002e+09 ... 1.410192e+10 1.711485e+10 1.909102e+10
2014 2015 2016 2017 2018 \
0 9.850000e+11 9.200000e+11 8.730000e+11 9.850000e+11 1.010000e+12
1 8.650000e+11 7.610000e+11 6.910000e+11 6.840000e+11 7.420000e+11
2 1.470000e+12 1.350000e+12 1.210000e+12 1.330000e+12 1.430000e+12
3 4.420000e+11 3.820000e+11 3.960000e+11 4.160000e+11 4.550000e+11
4 2.705783e+09 3.104395e+09 2.732809e+09 2.748180e+09 2.668496e+09
.. ... ... ... ... ...
115 7.277148e+08 7.554000e+08 7.744296e+08 7.921778e+08 8.113000e+08
116 7.960000e+13 7.510000e+13 7.630000e+13 8.120000e+13 8.630000e+13
117 3.810000e+11 3.470000e+11 3.240000e+11 3.810000e+11 4.050000e+11
118 2.714102e+10 2.125122e+10 2.095841e+10 2.587360e+10 2.631159e+10
119 1.949552e+10 1.996312e+10 2.054868e+10 1.758489e+10 1.811554e+10
2019 2020
0 1.010000e+12 9.210000e+11
1 7.950000e+11 7.850000e+11
2 1.390000e+12 1.330000e+12
3 4.450000e+11 4.330000e+11
4 2.631434e+09 2.841786e+09
.. ... ...
115 8.250407e+08 8.074741e+08
116 8.760000e+13 8.470000e+13
117 3.880000e+11 3.350000e+11
118 2.330867e+10 1.811063e+10
119 1.928429e+10 1.805117e+10
[120 rows x 63 columns]
df4
-------------------------------------------------------------------------------------------------------------------------------------------------------
"LOCATION" INDICATOR SUBJECT MEASURE FREQUENCY TIME Value \
0 AUS HUR TOT PC_LF A 2000 6.285546
1 AUS HUR TOT PC_LF A 2001 6.742173
2 AUS HUR TOT PC_LF A 2002 6.368911
3 AUS HUR TOT PC_LF A 2003 5.928420
4 AUS HUR TOT PC_LF A 2004 5.396734
.. ... ... ... ... ... ... ...
626 CRI HUR TOT PC_LF A 2011 10.298480
627 CRI HUR TOT PC_LF A 2012 10.171750
628 CRI HUR TOT PC_LF A 2013 9.386163
629 CRI HUR TOT PC_LF A 2014 9.617385
630 CRI HUR TOT PC_LF A 2015 9.612973
Flag Codes
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
.. ...
626 NaN
627 NaN
628 NaN
629 NaN
630 NaN
[631 rows x 8 columns]
filtered_df.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
country 0
year 0
sex 0
age 0
suicides_no 0
population 0
suicides/100k pop 0
country-year 0
HDI for year 0
gdp_for_year ($) 0
gdp_per_capita ($) 0
generation 0
dtype: int64
df2_filtered.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity 0
Year 0
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) 0
dtype: int64
df3_filtered.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
df4_filtered.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
LOCATION 0
TIME 0
Value 0
dtype: int64
filtered_df
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
132 Albania 2000 male 25-34 years 17 232000
133 Albania 2000 male 55-74 years 10 177400
134 Albania 2000 female 75+ years 2 37800
135 Albania 2000 male 75+ years 1 24900
136 Albania 2000 female 15-24 years 6 263900
... ... ... ... ... ... ...
27815 Uzbekistan 2014 female 35-54 years 107 3620833
27816 Uzbekistan 2014 female 75+ years 9 348465
27817 Uzbekistan 2014 male 5-14 years 60 2762158
27818 Uzbekistan 2014 female 5-14 years 44 2631600
27819 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
132 7.33 Albania2000 0.656 3.632044e+09
133 5.64 Albania2000 0.656 3.632044e+09
134 5.29 Albania2000 0.656 3.632044e+09
135 4.02 Albania2000 0.656 3.632044e+09
136 2.27 Albania2000 0.656 3.632044e+09
... ... ... ... ...
27815 2.96 Uzbekistan2014 0.675 6.306708e+10
27816 2.58 Uzbekistan2014 0.675 6.306708e+10
27817 2.17 Uzbekistan2014 0.675 6.306708e+10
27818 1.67 Uzbekistan2014 0.675 6.306708e+10
27819 1.46 Uzbekistan2014 0.675 6.306708e+10
gdp_per_capita ($) generation
132 1299 Generation X
133 1299 Silent
134 1299 G.I. Generation
135 1299 G.I. Generation
136 1299 Generation X
... ... ...
27815 2309 Generation X
27816 2309 Silent
27817 2309 Generation Z
27818 2309 Generation Z
27819 2309 Boomers
[16008 rows x 12 columns]
df2_filtered
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity Year \
10 Afghanistan 2000
11 Afghanistan 2001
12 Afghanistan 2002
13 Afghanistan 2003
14 Afghanistan 2004
... ... ...
6831 Zimbabwe 2011
6832 Zimbabwe 2012
6833 Zimbabwe 2013
6834 Zimbabwe 2014
6835 Zimbabwe 2015
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)
10 4.79
11 4.79
12 4.79
13 4.79
14 4.79
... ...
6831 3.29
6832 3.30
6833 3.30
6834 3.31
6835 3.32
[3648 rows x 3 columns]
df3_filtered
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 2000 2001 \
0 Africa Eastern and Southern AFE 2.840000e+11 2.590000e+11
1 Africa Western and Central AFW 1.400000e+11 1.480000e+11
2 Australia AUS 4.160000e+11 3.790000e+11
3 Austria AUT 1.970000e+11 1.970000e+11
4 Burundi BDI 8.704861e+08 8.767947e+08
.. ... ... ... ...
115 St. Vincent and the Grenadines VCT 3.962630e+08 4.300407e+08
116 World WLD 3.380000e+13 3.360000e+13
117 South Africa ZAF 1.520000e+11 1.350000e+11
118 Zambia ZMB 3.600683e+09 4.094481e+09
119 Zimbabwe ZWE 6.689958e+09 6.777385e+09
2002 2003 2004 2005 2006 \
0 2.650000e+11 3.530000e+11 4.390000e+11 5.120000e+11 5.760000e+11
1 1.770000e+11 2.050000e+11 2.540000e+11 3.110000e+11 3.930000e+11
2 3.950000e+11 4.670000e+11 6.140000e+11 6.950000e+11 7.480000e+11
3 2.130000e+11 2.620000e+11 3.010000e+11 3.160000e+11 3.360000e+11
4 8.253945e+08 7.846544e+08 9.152573e+08 1.117113e+09 1.273375e+09
.. ... ... ... ... ...
115 4.618852e+08 4.818074e+08 5.219741e+08 5.507296e+08 6.109296e+08
116 3.490000e+13 3.910000e+13 4.410000e+13 4.780000e+13 5.180000e+13
117 1.290000e+11 1.970000e+11 2.560000e+11 2.890000e+11 3.040000e+11
118 4.193846e+09 4.901840e+09 6.221078e+09 8.331870e+09 1.275686e+10
119 6.342116e+09 5.727592e+09 5.805598e+09 5.755215e+09 5.443896e+09
2007 2008 2009 2010 2011 \
0 6.610000e+11 7.080000e+11 7.130000e+11 8.470000e+11 9.430000e+11
1 4.620000e+11 5.660000e+11 5.070000e+11 5.920000e+11 6.710000e+11
2 8.540000e+11 1.060000e+12 9.280000e+11 1.150000e+12 1.400000e+12
3 3.890000e+11 4.300000e+11 4.000000e+11 3.920000e+11 4.310000e+11
4 1.356199e+09 1.611836e+09 1.781455e+09 2.032135e+09 2.235821e+09
.. ... ... ... ... ...
115 6.844444e+08 6.954296e+08 6.749222e+08 6.812259e+08 6.761296e+08
116 5.830000e+13 6.400000e+13 6.070000e+13 6.650000e+13 7.370000e+13
117 3.330000e+11 3.160000e+11 3.300000e+11 4.170000e+11 4.580000e+11
118 1.405696e+10 1.791086e+10 1.532834e+10 2.026556e+10 2.345952e+10
119 5.291950e+09 4.415703e+09 9.665793e+09 1.204166e+10 1.410192e+10
2012 2013 2014 2015
0 9.510000e+11 9.640000e+11 9.850000e+11 9.200000e+11
1 7.280000e+11 8.210000e+11 8.650000e+11 7.610000e+11
2 1.550000e+12 1.580000e+12 1.470000e+12 1.350000e+12
3 4.090000e+11 4.300000e+11 4.420000e+11 3.820000e+11
4 2.333308e+09 2.451625e+09 2.705783e+09 3.104395e+09
.. ... ... ... ...
115 6.929333e+08 7.212074e+08 7.277148e+08 7.554000e+08
116 7.530000e+13 7.740000e+13 7.960000e+13 7.510000e+13
117 4.340000e+11 4.010000e+11 3.810000e+11 3.470000e+11
118 2.550306e+10 2.803724e+10 2.714102e+10 2.125122e+10
119 1.711485e+10 1.909102e+10 1.949552e+10 1.996312e+10
[120 rows x 18 columns]
df4_filtered
-------------------------------------------------------------------------------------------------------------------------------------------------------
LOCATION TIME Value
0 AUS 2000 6.285546
1 AUS 2001 6.742173
2 AUS 2002 6.368911
3 AUS 2003 5.928420
4 AUS 2004 5.396734
.. ... ... ...
626 CRI 2011 10.298480
627 CRI 2012 10.171750
628 CRI 2013 9.386163
629 CRI 2014 9.617385
630 CRI 2015 9.612973
[631 rows x 3 columns]
average_suicide_rates_by_country = filtered_df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
country
Lithuania 38.015208
Republic of Korea 35.543646
Russian Federation 31.338229
Guyana 30.191667
Sri Lanka 30.104000
...
Antigua and Barbuda 0.874405
Barbados 0.834881
Oman 0.736111
Jamaica 0.688583
Kiribati 0.000000
Name: suicides/100k pop, Length: 97, dtype: float64
average_suicide_rates_by_country_and_year = filtered_df.groupby(['year','country'])['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
year country
2000 Albania 2.558333
Antigua and Barbuda 3.330833
Argentina 10.949167
Armenia 2.858333
Aruba 25.444167
...
2015 Turkmenistan 2.373333
Ukraine 20.393333
United Kingdom 7.228333
United States 14.617500
Uruguay 22.501667
Name: suicides/100k pop, Length: 1334, dtype: float64
average_suicide_rates_by_gender = filtered_df.groupby('sex')['suicides/100k pop'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
sex
female 4.901246
male 19.299390
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_gender_and_year = filtered_df.groupby(['year','sex'])['suicides/100k pop'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
year sex
2000 female 5.783004
male 22.099651
2001 female 5.491932
male 21.546345
2002 female 5.606105
male 21.966996
2003 female 5.302384
male 21.107655
2004 female 5.054127
male 19.909762
2005 female 5.007599
male 19.129286
2006 female 4.769059
male 19.085863
2007 female 4.994012
male 19.056667
2008 female 4.979412
male 18.835961
2009 female 4.528240
male 18.273333
2010 female 4.406117
male 18.025682
2011 female 4.344322
male 17.686376
2012 female 4.521626
male 18.321811
2013 female 4.366208
male 17.849708
2014 female 4.458803
male 17.564124
2015 female 4.653468
male 17.534677
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_age = filtered_df.groupby('age')['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
age
75+ years 22.080315
55-74 years 15.342031
35-54 years 14.420795
25-34 years 11.625240
15-24 years 8.504663
5-14 years 0.628864
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_age_and_year = filtered_df.groupby(['year','age'])['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
year age
2000 15-24 years 9.442733
25-34 years 13.756919
35-54 years 16.443372
5-14 years 0.532442
55-74 years 17.531279
...
2015 25-34 years 10.111774
35-54 years 12.555645
5-14 years 0.704677
55-74 years 14.451774
75+ years 20.967339
Name: suicides/100k pop, Length: 96, dtype: float64
correlations = filtered_df[['suicides/100k pop', 'gdp_per_capita ($)', ' gdp_for_year ($) ']].corr()
-------------------------------------------------------------------------------------------------------------------------------------------------------
suicides/100k pop gdp_per_capita ($) gdp_for_year ($)
suicides/100k pop 1.000000 -0.010388 0.026922
gdp_per_capita ($) -0.010388 1.000000 0.271639
gdp_for_year ($) 0.026922 0.271639 1.000000
average_suicide_rates_by_generation = filtered_df.groupby('generation')['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
generation
G.I. Generation 25.941221
Silent 19.236765
Boomers 14.789173
Generation X 12.256906
Millenials 6.720002
Generation Z 0.642299
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_generation_and_year = filtered_df.groupby(['year', 'generation'])['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
year generation
2000 Boomers 16.443372
G.I. Generation 25.941221
Generation X 11.599826
Millenials 0.532442
Silent 17.531279
...
2015 Boomers 14.451774
Generation X 12.555645
Generation Z 0.704677
Millenials 8.942500
Silent 20.967339
Name: suicides/100k pop, Length: 73, dtype: float64
average_suicide_rates_by_year = filtered_df.groupby('year')['suicides/100k pop'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
year
2000 13.941328
2001 13.519138
2002 13.786550
2003 13.205019
2004 12.481944
2005 12.068442
2006 11.927461
2007 12.025339
2008 11.907686
2009 11.400787
2010 11.215900
2011 11.015349
2012 11.421718
2013 11.107958
2014 11.011464
2015 11.094073
Name: suicides/100k pop, dtype: float64
prevalence_by_country = df2_filtered.groupby(['Entity', 'Year'])['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity Year
Afghanistan 2000 4.79
2001 4.79
2002 4.79
2003 4.79
2004 4.79
...
Zimbabwe 2011 3.29
2012 3.30
2013 3.30
2014 3.31
2015 3.32
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), Length: 3648, dtype: float64
average_prevalence_by_country = df2_filtered.groupby('Entity')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity
Afghanistan 4.841875
African Region (WHO) 3.537500
Albania 3.990625
Algeria 4.918125
American Samoa 4.193750
...
World Bank Lower Middle Income 3.293125
World Bank Upper Middle Income 4.211875
Yemen 4.914375
Zambia 3.881250
Zimbabwe 3.303125
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), Length: 228, dtype: float64
mean_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Year
2000 4.305789
2001 4.309298
2002 4.311798
2003 4.313684
2004 4.315526
2005 4.317544
2006 4.323596
2007 4.334781
2008 4.348816
2009 4.361491
2010 4.369123
2011 4.373772
2012 4.378772
2013 4.383860
2014 4.389649
2015 4.394605
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64
median_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].median()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Year
2000 4.055
2001 4.055
2002 4.055
2003 4.055
2004 4.055
2005 4.055
2006 4.070
2007 4.090
2008 4.115
2009 4.130
2010 4.140
2011 4.150
2012 4.160
2013 4.170
2014 4.180
2015 4.180
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64
total_gdp = df3_filtered.iloc[:, 2:].sum(axis=0)
-------------------------------------------------------------------------------------------------------------------------------------------------------
2000 2.160246e+14
2001 2.151838e+14
2002 2.233870e+14
2003 2.505006e+14
2004 2.831420e+14
2005 3.095809e+14
2006 3.389768e+14
2007 3.857718e+14
2008 4.279962e+14
2009 4.111343e+14
2010 4.573194e+14
2011 5.119564e+14
2012 5.284604e+14
2013 5.477950e+14
2014 5.653896e+14
2015 5.399171e+14
dtype: float64
average_gdp = df3_filtered.iloc[:, 2:].mean(axis=0)
-------------------------------------------------------------------------------------------------------------------------------------------------------
2000 1.800205e+12
2001 1.793198e+12
2002 1.861558e+12
2003 2.087505e+12
2004 2.359517e+12
2005 2.579841e+12
2006 2.824807e+12
2007 3.214765e+12
2008 3.566635e+12
2009 3.426120e+12
2010 3.810995e+12
2011 4.266303e+12
2012 4.403837e+12
2013 4.564958e+12
2014 4.711580e+12
2015 4.499309e+12
dtype: float64
mean_unemployment = df4_filtered.groupby('TIME')['Value'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
TIME
2000 7.901248
2001 7.765694
2002 7.832388
2003 7.879029
2004 7.936171
2005 7.560420
2006 6.812503
2007 6.196204
2008 6.307551
2009 8.764485
2010 9.282159
2011 8.937410
2012 9.230890
2013 9.234676
2014 8.744107
2015 8.165753
Name: Value, dtype: float64
median_unemployment = df4_filtered.groupby('TIME')['Value'].median()
-------------------------------------------------------------------------------------------------------------------------------------------------------
TIME
2000 6.779167
2001 6.666920
2002 6.886809
2003 7.575000
2004 7.375000
2005 7.783333
2006 6.466667
2007 5.966615
2008 6.239756
2009 8.219090
2010 8.375000
2011 8.045834
2012 8.020834
2013 8.200000
2014 7.456700
2015 6.915025
Name: Value, dtype: float64
<ipython-input-1-29986ce04bff>:336: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df3_filtered.rename(columns={'Country Name': 'country', 'Country Code': 'country_code'}, inplace=True)
merged_df = filtered_df.merge(df2_filtered, on=['country', 'year'], how='left')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
0 Albania 2000 male 25-34 years 17 232000
1 Albania 2000 male 55-74 years 10 177400
2 Albania 2000 female 75+ years 2 37800
3 Albania 2000 male 75+ years 1 24900
4 Albania 2000 female 15-24 years 6 263900
... ... ... ... ... ... ...
16003 Uzbekistan 2014 female 35-54 years 107 3620833
16004 Uzbekistan 2014 female 75+ years 9 348465
16005 Uzbekistan 2014 male 5-14 years 60 2762158
16006 Uzbekistan 2014 female 5-14 years 44 2631600
16007 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
0 7.33 Albania2000 0.656 3.632044e+09
1 5.64 Albania2000 0.656 3.632044e+09
2 5.29 Albania2000 0.656 3.632044e+09
3 4.02 Albania2000 0.656 3.632044e+09
4 2.27 Albania2000 0.656 3.632044e+09
... ... ... ... ...
16003 2.96 Uzbekistan2014 0.675 6.306708e+10
16004 2.58 Uzbekistan2014 0.675 6.306708e+10
16005 2.17 Uzbekistan2014 0.675 6.306708e+10
16006 1.67 Uzbekistan2014 0.675 6.306708e+10
16007 1.46 Uzbekistan2014 0.675 6.306708e+10
gdp_per_capita ($) generation \
0 1299 Generation X
1 1299 Silent
2 1299 G.I. Generation
3 1299 G.I. Generation
4 1299 Generation X
... ... ...
16003 2309 Generation X
16004 2309 Silent
16005 2309 Generation Z
16006 2309 Generation Z
16007 2309 Boomers
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)
0 3.93
1 3.93
2 3.93
3 3.93
4 3.93
... ...
16003 2.12
16004 2.12
16005 2.12
16006 2.12
16007 2.12
[16008 rows x 13 columns]
merged_df = merged_df.merge(df3_long, on=['country', 'year'], how='left')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
0 Albania 2000 male 25-34 years 17 232000
1 Albania 2000 male 55-74 years 10 177400
2 Albania 2000 female 75+ years 2 37800
3 Albania 2000 male 75+ years 1 24900
4 Albania 2000 female 15-24 years 6 263900
... ... ... ... ... ... ...
16003 Uzbekistan 2014 female 35-54 years 107 3620833
16004 Uzbekistan 2014 female 75+ years 9 348465
16005 Uzbekistan 2014 male 5-14 years 60 2762158
16006 Uzbekistan 2014 female 5-14 years 44 2631600
16007 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
0 7.33 Albania2000 0.656 3.632044e+09
1 5.64 Albania2000 0.656 3.632044e+09
2 5.29 Albania2000 0.656 3.632044e+09
3 4.02 Albania2000 0.656 3.632044e+09
4 2.27 Albania2000 0.656 3.632044e+09
... ... ... ... ...
16003 2.96 Uzbekistan2014 0.675 6.306708e+10
16004 2.58 Uzbekistan2014 0.675 6.306708e+10
16005 2.17 Uzbekistan2014 0.675 6.306708e+10
16006 1.67 Uzbekistan2014 0.675 6.306708e+10
16007 1.46 Uzbekistan2014 0.675 6.306708e+10
gdp_per_capita ($) generation \
0 1299 Generation X
1 1299 Silent
2 1299 G.I. Generation
3 1299 G.I. Generation
4 1299 Generation X
... ... ...
16003 2309 Generation X
16004 2309 Silent
16005 2309 Generation Z
16006 2309 Generation Z
16007 2309 Boomers
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) \
0 3.93
1 3.93
2 3.93
3 3.93
4 3.93
... ...
16003 2.12
16004 2.12
16005 2.12
16006 2.12
16007 2.12
country_code gdp
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
16003 NaN NaN
16004 NaN NaN
16005 NaN NaN
16006 NaN NaN
16007 NaN NaN
[16008 rows x 15 columns]
merged_df = merged_df.merge(df4_filtered, on=['country', 'year'], how='left')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
804 Australia 2000 male 25-34 years 466 1430700
805 Australia 2000 male 75+ years 115 416077
806 Australia 2000 male 35-54 years 745 2769752
807 Australia 2000 male 15-24 years 271 1333011
808 Australia 2000 male 55-74 years 281 1522620
... ... ... ... ... ... ...
15679 United States 2015 female 25-34 years 1444 21555712
15680 United States 2015 female 15-24 years 1132 21633813
15681 United States 2015 female 75+ years 540 11778666
15682 United States 2015 male 5-14 years 255 21273987
15683 United States 2015 female 5-14 years 158 20342901
suicides/100k pop gdp_for_year ($) gdp_per_capita ($) \
804 32.57 4.150342e+11 23219
805 27.64 4.150342e+11 23219
806 26.90 4.150342e+11 23219
807 20.33 4.150342e+11 23219
808 18.46 4.150342e+11 23219
... ... ... ...
15679 6.70 1.812071e+13 60387
15680 5.23 1.812071e+13 60387
15681 4.58 1.812071e+13 60387
15682 1.20 1.812071e+13 60387
15683 0.78 1.812071e+13 60387
generation \
804 Generation X
805 G.I. Generation
806 Boomers
807 Generation X
808 Silent
... ...
15679 Millenials
15680 Millenials
15681 Silent
15682 Generation Z
15683 Generation Z
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) \
804 5.61
805 5.61
806 5.61
807 5.61
808 5.61
... ...
15679 5.61
15680 5.61
15681 5.61
15682 5.61
15683 5.61
value
804 6.285546
805 6.285546
806 6.285546
807 6.285546
808 6.285546
... ...
15679 5.291667
15680 5.291667
15681 5.291667
15682 5.291667
15683 5.291667
[4128 rows x 12 columns]
merged_df.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
country 0
year 0
sex 0
age 0
suicides_no 0
population 0
suicides/100k pop 0
gdp_for_year ($) 0
gdp_per_capita ($) 0
generation 0
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) 0
value 0
dtype: int64
merged_df.describe(include='all')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
count 4128 4128.000000 4128 4128 4128.000000 4.128000e+03
unique 24 NaN 2 6 NaN NaN
top Italy NaN male 25-34 years NaN NaN
freq 192 NaN 2064 688 NaN NaN
mean NaN 2007.720930 NaN NaN 424.522771 3.507311e+06
std NaN 4.558069 NaN NaN 1162.719207 5.880734e+06
min NaN 2000.000000 NaN NaN 0.000000 6.532000e+03
25% NaN 2004.000000 NaN NaN 17.000000 4.877772e+05
50% NaN 2008.000000 NaN NaN 88.000000 1.277342e+06
75% NaN 2012.000000 NaN NaN 290.000000 3.959502e+06
max NaN 2015.000000 NaN NaN 11767.000000 4.380521e+07
suicides/100k pop gdp_for_year ($) gdp_per_capita ($) generation \
count 4128.000000 4.128000e+03 4128.000000 4128
unique NaN NaN NaN 6
top NaN NaN NaN Millenials
freq NaN NaN NaN 1156
mean 11.867384 1.563796e+12 40700.299419 NaN
std 13.023472 3.035203e+12 23309.800208 NaN
min 0.000000 1.131644e+10 4866.000000 NaN
25% 2.340000 2.382038e+11 24759.000000 NaN
50% 7.670000 4.880970e+11 40347.500000 NaN
75% 17.502500 1.464961e+12 51772.000000 NaN
max 99.840000 1.812071e+13 126352.000000 NaN
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) \
count 4128.000000
unique NaN
top NaN
freq NaN
mean 5.677878
std 1.341483
min 2.560000
25% 4.730000
50% 5.830000
75% 6.482500
max 8.780000
value
count 4128.000000
unique NaN
top NaN
freq NaN
mean 7.641546
std 4.035549
min 1.900000
25% 5.031250
50% 6.870833
75% 9.008333
max 27.825000
grouped_data = merged_df.groupby(['country', 'year']).agg
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year suicides_no population gdp_for_year ($) \
0 Australia 2000 2391 17874410 4.150342e+11
1 Australia 2001 2458 18130883 3.782151e+11
2 Australia 2002 2319 18370058 3.944867e+11
3 Australia 2003 2156 18608029 4.662947e+11
4 Australia 2004 2114 18854551 6.119043e+11
.. ... ... ... ... ...
339 United States 2011 39508 290313825 1.551793e+13
340 United States 2012 40596 292827128 1.615526e+13
341 United States 2013 41143 295322862 1.669152e+13
342 United States 2014 42769 297749735 1.742761e+13
343 United States 2015 44189 300078511 1.812071e+13
gdp_per_capita ($) Anxiety Prevalence (%) Unemployment Rate(%) \
0 23219 5.61 6.285546
1 20860 5.66 6.742173
2 21474 5.78 6.368911
3 25059 5.92 5.928420
4 32454 6.04 5.396734
.. ... ... ...
339 53452 6.22 8.950000
340 55170 6.03 8.066667
341 56520 5.84 7.375000
342 58531 5.68 6.166667
343 60387 5.61 5.291667
suicides/100k pop
0 13.376665
1 13.556979
2 12.623803
3 11.586396
4 11.212147
.. ...
339 13.608722
340 13.863470
341 13.931532
342 14.364077
343 14.725813
[344 rows x 9 columns]
grouped_data.describe(include='all')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year suicides_no population gdp_for_year ($) \
count 344 344.000000 344.000000 3.440000e+02 3.440000e+02
unique 24 NaN NaN NaN NaN
top Italy NaN NaN NaN NaN
freq 16 NaN NaN NaN NaN
mean NaN 2007.720930 5094.273256 4.208773e+07 1.563796e+12
std NaN 4.564155 9151.842382 6.224772e+07 3.039256e+12
min NaN 2000.000000 26.000000 2.683300e+05 1.131644e+10
25% NaN 2004.000000 905.750000 7.858817e+06 2.382038e+11
50% NaN 2008.000000 1844.500000 1.542053e+07 4.880970e+11
75% NaN 2012.000000 3980.750000 5.648715e+07 1.464961e+12
max NaN 2015.000000 44189.000000 3.000785e+08 1.812071e+13
gdp_per_capita ($) Anxiety Prevalence (%) Unemployment Rate(%) \
count 344.000000 344.000000 344.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 40700.299419 5.677878 7.641546
std 23340.927117 1.343274 4.040938
min 4866.000000 2.560000 1.900000
25% 24759.000000 4.730000 5.031250
50% 40347.500000 5.830000 6.870833
75% 51772.000000 6.482500 9.008333
max 126352.000000 8.780000 27.825000
suicides/100k pop
count 344.000000
unique NaN
top NaN
freq NaN
mean 12.036260
std 5.413287
min 1.591815
25% 7.764163
50% 11.738373
75% 14.491590
max 26.480336
14068750000000.0 14450000000000.0 2398363700004.65 3650000000000.0
count 16.00 mean 6.48 std 0.52 min 5.61 25% 6.17 50% 6.49 75% 6.82 max 7.31 Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64 6.485
count 16008.00 mean 237.05 std 884.03 min 0.00 25% 2.00 50% 23.00 75% 120.00 max 21262.00 Name: suicides_no, dtype: float64 23.0
count 180.00 mean 6.01 std 0.18 min 5.61 25% 5.92 50% 6.08 75% 6.14 max 6.16 Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64 6.045
<ipython-input-1-29986ce04bff>:611: UserWarning: FixedFormatter should only be used together with FixedLocator axs[1, 0].set_xticklabels(average_suicide_rates_by_age.index, rotation=45, ha='right') <ipython-input-1-29986ce04bff>:620: UserWarning: FixedFormatter should only be used together with FixedLocator axs[1, 1].set_xticklabels(average_suicide_rates_by_generation.index, rotation=45, ha='right')
InĀ [36]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
from pandas.tseries.offsets import DateOffset
import warnings
warnings.filterwarnings('ignore')
# Assuming your data is in a pandas DataFrame called "data"
# Calculate correlation matrix
# Display correlation matrix
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
# ------------------------------
# Rebind the notebook-wide `df` (earlier loaded from master.csv) to the contents
# of transformed_data.csv; the forecast and per-country correlation code below
# reads from this frame.
df = pd.read_csv("transformed_data.csv")
# ------------------------------
def predict_future(df, independent_var, forecast_years=5, order=(1, 1, 1)):
    """Forecast the yearly mean of one column with ARIMA and plot the result.

    The rows of ``df`` are averaged per ``year`` for ``independent_var``, an
    ARIMA model is fitted to that yearly series, and the historical values are
    plotted together with the forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column and the column named by ``independent_var``.
    independent_var : str
        Name of the column whose yearly mean is modelled.
    forecast_years : int, optional
        Number of yearly steps to forecast (default 5, the previous fixed value).
    order : tuple of int, optional
        ARIMA ``(p, d, q)`` order (default ``(1, 1, 1)``, the previous fixed
        value). Other orders may fit better; experiment or use an automated
        selection approach.

    Returns
    -------
    The object returned by ``model_fit.forecast(steps=forecast_years)``.
    """
    # Yearly mean of the selected column, kept as a one-column frame indexed by year.
    df_yearly = df.groupby('year')[independent_var].mean().to_frame()
    # Parse the year labels as dates so the index is a DatetimeIndex.
    df_yearly.index = pd.to_datetime(df_yearly.index, format='%Y')

    model_fit = ARIMA(df_yearly, order=order).fit()
    forecast = model_fit.forecast(steps=forecast_years)

    # The x positions of the forecast must be dates as well. Adding a plain
    # integer to a Timestamp (the former `index[-1] + 1`) raises TypeError, so
    # step forward one calendar year at a time with DateOffset instead.
    last_date = df_yearly.index[-1]
    future_dates = [
        last_date + pd.DateOffset(years=step)
        for step in range(1, forecast_years + 1)
    ]

    # Plot the historical data and the predictions
    plt.figure(figsize=(12, 6))
    plt.plot(df_yearly.index, df_yearly[independent_var], label='Historical Data')
    plt.plot(future_dates, forecast, label='Predictions', linestyle='--', color='red')
    plt.xlabel('Year')
    plt.ylabel(independent_var)
    plt.title(f'{independent_var} : Historical Data and Predictions')
    plt.legend()
    plt.show()
    return forecast
def psk(x, y):
    """Print the Pearson, Spearman and Kendall correlation coefficients of x and y.

    Each coefficient is the first element of the result returned by the
    corresponding ``scipy.stats`` function. Nothing is returned.
    """
    measures = (
        ("Pearson: ", scipy.stats.pearsonr),
        ("Spearmanr: ", scipy.stats.spearmanr),
        ("Kendall: ", scipy.stats.kendalltau),
    )
    # Evaluate all three statistics before printing, so a failure in any of
    # them produces no partial output.
    coefficients = [(label, statistic(x, y)[0]) for label, statistic in measures]
    for label, value in coefficients:
        print(label + str(value))
def visual(x, y):
    """Scatter-plot y against x together with the fitted regression line.

    The line comes from ``scipy.stats.linregress(x, y)``; its equation and the
    correlation coefficient r are shown in the legend. The axes are labelled
    with the literal strings 'x' and 'y'.

    Parameters
    ----------
    x, y :
        Paired observations. ``x`` must support element-wise arithmetic
        (``slope * x``), e.g. a pandas Series or NumPy array.
    """
    # Only slope, intercept and r are used; the p-value and standard error
    # returned by linregress are ignored.
    slope, intercept, r, _p_value, _stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x,r={r:.2f}'
    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='s', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend(facecolor='white', loc='upper left')
    # The original referenced `plt.show` without calling it, which is a no-op;
    # call it so the figure is displayed explicitly.
    plt.show()
def data_analytics(val, country):
    """Fit and report a one-variable linear regression of suicide rate for a country.

    The rows of the module-level ``grouped_data`` frame (defined outside this
    function) whose 'country' equals ``country`` are split 80/20 with a fixed
    random state. A ``LinearRegression`` of 'suicides/100k pop' on ``val`` is
    fitted on the training part and evaluated on the held-out part.

    Printed output: the actual vs. predicted table, the mean squared error and
    R-squared. Figures shown: training points with the predicted line, actual
    vs. predicted values, and the residuals.

    Parameters
    ----------
    val : str
        Column of ``grouped_data`` used as the single predictor.
    country : str
        Country whose rows are analysed.
    """
    rule = "-------------------------------------------------------------------------------------------------------------------------------------------------------"

    def report(title, value):
        # One report section: heading, separator rule, value, four blank lines.
        print(title)
        print(rule)
        print(value)
        for _ in range(4):
            print()

    country_rows = grouped_data[grouped_data['country'] == country]

    # Predictor and target for the regression
    predictor = country_rows[val]
    target = country_rows['suicides/100k pop']

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        predictor, target, test_size=0.2, random_state=42
    )

    # Each Series is reshaped into a single-column 2-D array for the model
    regressor = LinearRegression()
    train_features = X_train.values.reshape(-1, 1)
    train_target = y_train.values.reshape(-1, 1)
    test_features = X_test.values.reshape(-1, 1)
    regressor.fit(train_features, train_target)
    y_pred = regressor.predict(test_features)

    # Evaluation on the held-out rows
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    df_preds = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred.squeeze()})

    report(
        f'Current country being analyzed is {country} and the current independent variable is {val}',
        df_preds.reset_index(),
    )
    report("Mean Squared Error", mse)
    report("R-squared", r2)

    # Training points with the line predicted for the test points
    plt.scatter(X_train, y_train, color='b')
    plt.plot(X_test, y_pred, color='k')
    plt.show()

    # Actual vs. predicted values, plotted against the original row labels
    row_labels = df_preds.index
    plt.figure(figsize=(12, 6))
    plt.scatter(row_labels, df_preds['Actual'], label='Actual', alpha=0.7)
    plt.scatter(row_labels, df_preds['Predicted'], label='Predicted', alpha=0.7)
    plt.xlabel("Index")
    plt.ylabel("Suicides/100k pop")
    plt.title("Actual vs Predicted Suicide Rates")
    plt.legend()
    plt.show()

    # Residuals around the zero line
    residuals = df_preds['Actual'] - df_preds['Predicted']
    plt.figure(figsize=(12, 6))
    plt.scatter(row_labels, residuals, alpha=0.7)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Index")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

    # Disabled pair plot, kept for reference:
    #sns.pairplot(filtered_data[['suicides/100k pop', 'year', 'gdp_per_capita ($)', 'Anxiety Prevalence (%)']])
    plt.show()
# Forecast the yearly mean of 'suicides/100k pop' (averaged over all rows of df
# that share a year) five steps ahead with ARIMA(1, 1, 1) and plot it.
# NOTE(review): indentation was lost in this export; this section reads as
# top-level cell code (it uses none of data_analytics' arguments) — confirm
# against the original notebook.
df_yearly = df.groupby('year')['suicides/100k pop'].mean().reset_index()
# Save the yearly means while 'year' is still a regular column
df_yearly.to_csv('df_yearly.csv', index=False)
# Set the year as the index
df_yearly.set_index('year', inplace=True)
# Parse the year labels as dates (format '%Y') so the index holds timestamps
df_yearly.index = pd.to_datetime(df_yearly.index, format='%Y')
# Determine the order of the ARIMA model (p, d, q)
# You may need to experiment with different values or use an automated approach
p, d, q = 1, 1, 1
# Fit the ARIMA model
model = ARIMA(df_yearly, order=(p, d, q))
model_fit = model.fit()
# Make predictions for the next 5 years
forecast_years = 5
forecast = model_fit.forecast(steps=forecast_years)
# Create a new range of dates for the forecast period.
# The range starts at 0, so future_dates[0] is the last historical date itself;
# it is skipped when plotting (future_dates[1:]).
future_dates = [df_yearly.index[-1] + DateOffset(years=x) for x in range(0, forecast_years + 1)]
# Column of df_yearly to plot; also used for the axis label and title
independent_var = 'suicides/100k pop'
# Plot the historical data and the predictions
plt.figure(figsize=(12, 6))
plt.plot(df_yearly.index, df_yearly[independent_var], label='Historical Data')
plt.plot(future_dates[1:], forecast, label='Predictions', linestyle='--', color='red')
plt.xlabel('Year')
plt.ylabel(independent_var)
plt.title(f'{independent_var} : Historical Data and Predictions')
plt.legend()
plt.show()
def diff_country(country):
    """Show a correlation heatmap for one country, then run each regression.

    The rows of the module-level ``grouped_data`` frame (defined outside this
    function) whose 'country' equals ``country`` are selected. A heatmap of the
    pairwise correlations of their numeric columns is shown, and
    ``data_analytics`` is then called once per predictor column.

    Parameters
    ----------
    country : str
        Country whose rows are analysed.
    """
    filtered_data = grouped_data[grouped_data['country'] == country]

    # The frame still contains the string 'country' column. DataFrame.corr()
    # stopped silently dropping non-numeric columns in pandas 2.0 and raises on
    # them instead, so restrict the frame to numeric columns explicitly.
    correlation_matrix = filtered_data.select_dtypes(include='number').corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title("Correlation Matrix")
    plt.show()

    # Column names are used verbatim; ' gdp_for_year ($) ' really has the
    # leading and trailing spaces.
    predictors = (
        'year',
        'population',
        ' gdp_for_year ($) ',
        'gdp_per_capita ($)',
        'Anxiety Prevalence (%)',
        'Unemployment Rate(%)',
    )
    for predictor in predictors:
        data_analytics(predictor, country)
# Correlation heatmap plus the six single-variable regressions for the United States.
diff_country('United States')
Current country being analyzed is United States and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 10.869199 1 329 11.531207 11.111481 2 333 11.817534 12.080610 3 342 14.364077 14.261149 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.08957859509325357 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9430894492071287
Current country being analyzed is United States and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 10.709737 1 329 11.531207 11.029651 2 333 11.817534 12.103507 3 342 14.364077 14.279855 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.14255706368942966 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9094314774022294
Current country being analyzed is United States and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 10.983652 1 329 11.531207 11.136898 2 333 11.817534 12.260805 3 342 14.364077 14.231305 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.10295239130195934 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9345928869688314
Current country being analyzed is United States and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 11.000666 1 329 11.531207 11.131007 2 333 11.817534 12.337473 3 342 14.364077 14.147254 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.128254876924202 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9185178593163941
Current country being analyzed is United States and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 10.891326 1 329 11.531207 11.023838 2 333 11.817534 12.348964 3 342 14.364077 14.491252 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.16117236601308638 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.8976049120568782
Current country being analyzed is United States and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 12.221629 1 329 11.531207 12.401542 2 333 11.817534 12.482402 3 342 14.364077 12.749239 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 1.218332705101512 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.22597596865506298
InĀ [37]:
# Correlation heatmap plus the six single-variable regressions for Austria.
diff_country('Austria')
Current country being analyzed is Austria and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 20.916553 19.429268 1 16 19.503416 19.110184 2 20 17.872973 17.833847 3 29 16.140196 14.962088 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9390296625179708 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7056681316334978
Current country being analyzed is Austria and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 20.916553 19.681935 1 16 19.503416 19.311358 2 20 17.872973 17.628855 3 29 16.140196 14.892694 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7942559639203448 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7510463714265913
Current country being analyzed is Austria and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 20.916553 20.048981 1 16 19.503416 20.038469 2 20 17.872973 17.720943 3 29 16.140196 15.261313 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.45862834352714466 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.8562463544823377
Current country being analyzed is Austria and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 20.916553 20.007897 1 16 19.503416 20.019893 2 20 17.872973 17.666274 3 29 16.140196 15.411216 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.41663551940716836 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.8694086930905518
Current country being analyzed is Austria and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 20.916553 19.124615 1 16 19.503416 19.124615 2 20 17.872973 18.023865 3 29 16.140196 15.088530 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 1.1208257877135357 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.6486854873929309
Current country being analyzed is Austria and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 20.916553 18.350337 1 16 19.503416 18.290928 2 20 17.872973 15.855175 3 29 16.140196 15.855175 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 3.052084471995273 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.04334680690923731
InĀ [38]:
# Correlation heatmap plus the six single-variable regressions for Japan.
diff_country('Japan')
Current country being analyzed is Japan and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 25.042450 27.365185 1 182 24.263322 26.952335 2 186 25.169590 25.300937 3 195 20.251260 21.585290 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 3.6056943884944896 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.10814504717915707
Current country being analyzed is Japan and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 25.042450 23.620321 1 182 24.263322 23.732161 2 186 25.169590 23.943287 3 195 20.251260 23.806402 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 4.11186027159733 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.01705318127710731
Current country being analyzed is Japan and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 25.042450 24.001604 1 182 24.263322 24.327508 2 186 25.169590 24.075331 3 195 20.251260 24.022312 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 4.1264298180462315 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.02065690382283103
Current country being analyzed is Japan and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 25.042450 23.983157 1 182 24.263322 24.326329 2 186 25.169590 24.079895 3 195 20.251260 24.016646 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 4.1229104272563095 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.019786395740692964
Current country being analyzed is Japan and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 25.042450 25.773251 1 182 24.263322 25.773251 2 186 25.169590 25.920150 3 195 20.251260 21.513178 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 1.242433543595893 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.6926887334260716
Current country being analyzed is Japan and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 25.042450 24.404100 1 182 24.263322 25.048860 2 186 25.169590 23.810241 3 195 20.251260 22.113502 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 1.58508444911747 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.6079353200050406
InĀ [39]:
# Correlation heatmap plus the six single-variable regressions for the United Kingdom.
diff_country('United Kingdom')
Current country being analyzed is United Kingdom and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 7.642439 6.933167 1 313 7.428202 6.988295 2 317 7.123868 7.208805 3 326 7.904841 7.704955 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.18593863411956554 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -1.2651461468772602
Current country being analyzed is United Kingdom and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 7.642439 7.092265 1 313 7.428202 7.005241 2 317 7.123868 7.196821 3 326 7.904841 7.779361 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.1256639272475629 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.5308661481467192
Current country being analyzed is United Kingdom and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 7.642439 7.262022 1 313 7.428202 7.258194 2 317 7.123868 7.388394 3 326 7.904841 7.461100 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.11012487826787923 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.3415659680677705
Current country being analyzed is United Kingdom and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 7.642439 7.384043 1 313 7.428202 7.383937 2 317 7.123868 7.392880 3 326 7.904841 7.396137 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.09996869919375104 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.2178411165559342
Current country being analyzed is United Kingdom and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 7.642439 7.247641 1 313 7.428202 7.233386 2 317 7.123868 7.176367 3 326 7.904841 7.518482 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.08646215018678448 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.05330130703433733
Current country being analyzed is United Kingdom and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 7.642439 7.344251 1 313 7.428202 7.323643 2 317 7.123868 7.308531 3 326 7.904841 7.384092 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.1012819559018272 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.23383950433717704
InĀ [40]:
# Correlation heatmap plus the six single-variable regressions for Finland.
diff_country('Finland')
Current country being analyzed is Finland and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 23.864051 24.104406 1 91 24.576529 23.543363 2 95 20.033117 21.299190 3 104 15.290864 16.249801 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9119262984916898 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9330814073429998
Current country being analyzed is Finland and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 23.864051 23.634458 1 91 24.576529 23.160271 2 95 20.033117 21.425179 3 104 15.290864 15.950330 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 1.1078078650689904 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9187073084882107
Current country being analyzed is Finland and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 23.864051 22.611968 1 91 24.576529 22.502004 2 95 20.033117 20.273641 3 104 15.290864 18.253119 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 3.67604439511432 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7302460540149169
Current country being analyzed is Finland and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 23.864051 22.303295 1 91 24.576529 22.209655 2 95 20.033117 20.129226 3 104 15.290864 18.496498 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 4.580845725085966 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.6638503028056032
Current country being analyzed is Finland and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 23.864051 22.342579 1 91 24.576529 22.342579 2 95 20.033117 21.809616 3 104 15.290864 17.332723 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 3.6576363594099246 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.73159686476021
Current country being analyzed is Finland and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 23.864051 18.800548 1 91 24.576529 19.060503 2 95 20.033117 19.346782 3 104 15.290864 19.208578 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 17.971287067183003 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.3187614402171941
InĀ [41]:
# United States rows of df: correlate 'Unemployment Rate(%)' (x) with 'suicides_no' (y).
# `y` stays bound for the following United States cells, which only rebind `x`.
# NOTE(review): y is 'suicides_no', whereas data_analytics regresses on
# 'suicides/100k pop' — confirm that using this column here is intended.
usa = df.loc[df['country'] == 'United States']
y = usa['suicides_no']
x = usa['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.5229669784472624 Spearmanr: 0.5537561233777729 Kendall: 0.3025316904537665
InĀ [42]:
# United States: ' gdp_for_year ($) ' as x. `y` is not rebound here; when the
# cells run in order it is still usa['suicides_no'].
x = usa[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: 0.9694943115190687 Spearmanr: 0.988235294117647 Kendall: 0.9500000000000001
InĀ [43]:
# United States: 'population' as x. `y` is not rebound here; when the cells run
# in order it is still usa['suicides_no'].
x = usa['population']
psk(x,y)
visual(x,y)
Pearson: 0.9856287427902889 Spearmanr: 0.9970588235294118 Kendall: 0.9833333333333333
InĀ [44]:
# United States: 'Anxiety Prevalence (%)' as x. `y` is not rebound here; when
# the cells run in order it is still usa['suicides_no'].
x = usa['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: -0.9728282957877852 Spearmanr: -0.9970588235294118 Kendall: -0.9833333333333333
InĀ [45]:
# United Kingdom rows of df: correlate 'Unemployment Rate(%)' (x) with 'suicides_no' (y).
# `y` stays bound for the following United Kingdom cells, which only rebind `x`.
# NOTE(review): y is 'suicides_no', whereas data_analytics regresses on
# 'suicides/100k pop' — confirm that using this column here is intended.
UK = df.loc[df['country'] == 'United Kingdom']
y = UK['suicides_no']
x = UK['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.3784882272862713 Spearmanr: 0.6294117647058822 Kendall: 0.38333333333333336
InĀ [46]:
# United Kingdom: ' gdp_for_year ($) ' as x. `y` is not rebound here; when the
# cells run in order it is still UK['suicides_no'].
x = UK[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: 0.3911777286517507 Spearmanr: 0.31176470588235294 Kendall: 0.25
InĀ [47]:
# United Kingdom: 'population' as x. `y` is not rebound here; when the cells
# run in order it is still UK['suicides_no'].
x = UK['population']
psk(x,y)
visual(x,y)
Pearson: 0.8574078006265712 Spearmanr: 0.7411764705882353 Kendall: 0.5666666666666667
InĀ [48]:
# United Kingdom: 'Anxiety Prevalence (%)' as x. `y` is not rebound here; when
# the cells run in order it is still UK['suicides_no'].
x = UK['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: 0.6762483923632745 Spearmanr: 0.7016317042084812 Kendall: 0.4852162225324867
InĀ [49]:
# Austria rows of df: correlate 'Unemployment Rate(%)' (x) with 'suicides_no' (y).
# `y` stays bound for the following Austria cells, which only rebind `x`.
# NOTE(review): y is 'suicides_no', whereas data_analytics regresses on
# 'suicides/100k pop' — confirm that using this column here is intended.
Austria = df.loc[df['country'] == 'Austria']
y = Austria['suicides_no']
x = Austria['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: -0.5903485553040041 Spearmanr: -0.3873347352349848 Kendall: -0.235302425908485
InĀ [50]:
# Austria: ' gdp_for_year ($) ' as x. `y` is not rebound here; when the cells
# run in order it is still Austria['suicides_no'].
x = Austria[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: -0.9296965158168207 Spearmanr: -0.6764705882352942 Kendall: -0.5166666666666667
InĀ [51]:
# Austria: 'population' as x. `y` is not rebound here; when the cells run in
# order it is still Austria['suicides_no'].
x = Austria['population']
psk(x,y)
visual(x,y)
Pearson: -0.8643322853027127 Spearmanr: -0.7794117647058825 Kendall: -0.6333333333333333
InĀ [52]:
# Austria: 'Anxiety Prevalence (%)' as x. `y` is not rebound here; when the
# cells run in order it is still Austria['suicides_no'].
x = Austria['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: -0.8478652313303219 Spearmanr: -0.7770421529709435 Kendall: -0.6276205565667923
InĀ [53]:
# Japan rows of df: correlate 'Unemployment Rate(%)' (x) with 'suicides_no' (y).
# `y` stays bound for the following Japan cells, which only rebind `x`.
# NOTE(review): y is 'suicides_no', whereas data_analytics regresses on
# 'suicides/100k pop' — confirm that using this column here is intended.
Japan= df.loc[df['country'] == 'Japan']
y = Japan['suicides_no']
x = Japan['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.6664818726262071 Spearmanr: 0.39882277169531377 Kendall: 0.32636268941473195
InĀ [54]:
# Japan: ' gdp_for_year ($) ' as x. `y` is not rebound here; when the cells run
# in order it is still Japan['suicides_no'].
x = Japan[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: -0.1445703007592053 Spearmanr: -0.1647058823529412 Kendall: -0.13333333333333333
InĀ [55]:
# Japan: 'population' as x. `y` is not rebound here; when the cells run in
# order it is still Japan['suicides_no'].
x = Japan['population']
psk(x,y)
visual(x,y)
Pearson: 0.06208728311481601 Spearmanr: 0.06176470588235294 Kendall: 0.03333333333333333
InĀ [56]:
# Japan: 'Anxiety Prevalence (%)' as x. `y` is not rebound here; when the cells
# run in order it is still Japan['suicides_no'].
x = Japan['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: 0.8257599841197141 Spearmanr: 0.6879398467484049 Kendall: 0.5753670112546582
InĀ [57]:
# Finland rows of df: correlate 'Unemployment Rate(%)' (x) with 'suicides_no' (y).
# `y` stays bound for the following Finland cells, which only rebind `x`.
# NOTE(review): y is 'suicides_no', whereas data_analytics regresses on
# 'suicides/100k pop' — confirm that using this column here is intended.
Finland= df.loc[df['country'] == 'Finland']
y = Finland['suicides_no']
x = Finland['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.09726113184467595 Spearmanr: 0.3294117647058824 Kendall: 0.25
InĀ [58]:
# Finland: ' gdp_for_year ($) ' as x. `y` is not rebound here; when the cells
# run in order it is still Finland['suicides_no'].
x = Finland[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: -0.7187050203184171 Spearmanr: -0.7382352941176471 Kendall: -0.5833333333333334
InĀ [59]:
# Finland: 'population' as x. `y` is not rebound here; when the cells run in
# order it is still Finland['suicides_no'].
x = Finland['population']
psk(x,y)
visual(x,y)
Pearson: -0.9497484372918075 Spearmanr: -0.9529411764705882 Kendall: -0.8499999999999999
InĀ [60]:
# Finland: 'Anxiety Prevalence (%)' as x. `y` is not rebound here; when the
# cells run in order it is still Finland['suicides_no'].
x = Finland['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: -0.8413197138786257 Spearmanr: -0.9565240766177093 Kendall: -0.8692679567468138